{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Word2Vec: CBOW Model (Continuous Bag of Words)\n",
    "-------------------------------------\n",
    "\n",
    "In this example, we will download and preprocess the movie review data.\n",
    "\n",
    "From this data set we will compute/fit the CBOW model of the Word2Vec Algorithm.\n",
    "\n",
    "We start by loading the necessary libraries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import random\n",
    "import os\n",
    "import pickle\n",
    "import string\n",
    "import requests\n",
    "import collections\n",
    "import io\n",
    "import tarfile\n",
    "import urllib.request\n",
    "from nltk.corpus import stopwords\n",
    "from tensorflow.python.framework import ops\n",
    "ops.reset_default_graph()\n",
    "\n",
    "# Load text helpers\n",
    "import text_helpers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Make a saving directory if it doesn't exist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Create the output directory for the vocabulary pickle and the checkpoints.\n",
    "# exist_ok=True avoids the check-then-create race and still raises if the\n",
    "# path already exists but is not a directory.\n",
    "data_folder_name = 'temp'\n",
    "os.makedirs(data_folder_name, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Start a computational graph session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Open the session used by the later cells to run graph operations.\n",
    "sess = tf.Session(graph=tf.get_default_graph())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
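    "Declare the model parameters. We also set the checkpointing and reporting schedule: the embeddings are saved every 5,000 iterations, the nearest neighbors of the validation words are printed every 5,000 iterations, and the loss is printed every 1,000 iterations."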
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# --- Model hyperparameters ---\n",
    "batch_size = 200             # examples per training batch\n",
    "embedding_size = 50          # length of each word embedding vector\n",
    "vocabulary_size = 2000       # maximum vocabulary size\n",
    "generations = 50000          # number of training iterations\n",
    "model_learning_rate = 0.05   # learning rate handed to the optimizer\n",
    "\n",
    "# --- CBOW / NCE settings ---\n",
    "num_sampled = batch_size // 2   # number of negative examples to sample\n",
    "window_size = 3                 # context words considered on each side\n",
    "\n",
    "# --- Checkpoint and reporting cadence, in iterations ---\n",
    "save_embeddings_every = 5000\n",
    "print_valid_every = 5000\n",
    "print_loss_every = 1000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now we declare the stop words and the validation (test) words, load the movie review data, and normalize the text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Data\n",
      "Normalizing Text Data\n",
      "Done.\n"
     ]
    }
   ],
   "source": [
    "# English stop words from NLTK, passed to the text normalizer below\n",
    "stops = stopwords.words('english')\n",
    "\n",
    "# Probe words for inspecting the embeddings; we expect related words to rank near them.\n",
    "# They are mapped to vocabulary indices once the dictionary has been built.\n",
    "valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']\n",
    "\n",
    "# Load the movie review data\n",
    "print('Loading Data')\n",
    "texts, target = text_helpers.load_movie_data()\n",
    "\n",
    "# Normalize text\n",
    "print('Normalizing Text Data')\n",
    "texts = text_helpers.normalize_text(texts, stops)\n",
    "\n",
    "# Keep only texts with at least 3 words, dropping the matching targets as well\n",
    "long_enough = [len(review.split()) > 2 for review in texts]\n",
    "target = [label for label, keep in zip(target, long_enough) if keep]\n",
    "texts = [review for review, keep in zip(texts, long_enough) if keep]\n",
    "print('Done.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now we build our dataset and word dictionaries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating Dictionary\n"
     ]
    }
   ],
   "source": [
    "# Build the vocabulary (word -> index) and its inverse (index -> word)\n",
    "print('Creating Dictionary')\n",
    "word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)\n",
    "word_dictionary_rev = {index: word for word, index in word_dictionary.items()}\n",
    "\n",
    "# Convert the texts to numeric form using the dictionary\n",
    "text_data = text_helpers.text_to_numbers(texts, word_dictionary)\n",
    "\n",
    "# Vocabulary indices of the validation words (KeyError if a word is not in the dictionary)\n",
    "valid_examples = [word_dictionary[word] for word in valid_words]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "We create the CBOW model here.  We also create the placeholders necessary for the CBOW."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating Model\n"
     ]
    }
   ],
   "source": [
    "print('Creating Model')\n",
    "# Word embedding matrix: one row of length embedding_size per vocabulary entry\n",
    "embeddings = tf.Variable(\n",
    "    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
    "\n",
    "# Weights and biases consumed by the NCE loss\n",
    "nce_init_stddev = 1.0 / np.sqrt(embedding_size)\n",
    "nce_weights = tf.Variable(\n",
    "    tf.truncated_normal([vocabulary_size, embedding_size], stddev=nce_init_stddev))\n",
    "nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
    "\n",
    "# Placeholders: 2*window_size context word indices per example and one target index each\n",
    "context_width = 2 * window_size\n",
    "x_inputs = tf.placeholder(tf.int32, shape=[batch_size, context_width])\n",
    "y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
    "valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
    "\n",
    "# CBOW input: add together the embeddings of every context position in the window\n",
    "embed = tf.zeros([batch_size, embedding_size])\n",
    "for position in range(context_width):\n",
    "    embed = embed + tf.nn.embedding_lookup(embeddings, x_inputs[:, position])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now we create the loss, optimization function, and the cosine similarity between word vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# NCE loss for each example in the batch, averaged into a scalar training loss\n",
    "nce_per_example = tf.nn.nce_loss(weights=nce_weights,\n",
    "                                 biases=nce_biases,\n",
    "                                 labels=y_target,\n",
    "                                 inputs=embed,\n",
    "                                 num_sampled=num_sampled,\n",
    "                                 num_classes=vocabulary_size)\n",
    "loss = tf.reduce_mean(nce_per_example)\n",
    "\n",
    "# Plain gradient descent on the averaged loss\n",
    "sgd = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate)\n",
    "optimizer = sgd.minimize(loss)\n",
    "\n",
    "# Cosine similarity: scale every embedding row to unit length, then take the\n",
    "# dot product of each validation word's row with every row of the vocabulary.\n",
    "squared_row_sums = tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)\n",
    "norm = tf.sqrt(squared_row_sums)\n",
    "normalized_embeddings = embeddings / norm\n",
    "valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)\n",
    "similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now we create a model saving operation and initialize the model variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Saver that checkpoints only the embedding matrix, stored under the key \"embeddings\"\n",
    "saver = tf.train.Saver(var_list={\"embeddings\": embeddings})\n",
    "\n",
    "# Initialize every variable in the graph before training starts\n",
    "init = tf.global_variables_initializer()\n",
    "sess.run(init)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We need to make sure that all sentences have at least N words, where N is $(2 * WindowSize + 1)$.  So in this case, where window size is 3, we need to use sentences that have at least 7 words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# A full CBOW window needs window_size words on each side plus the target word,\n",
    "# so drop every sentence shorter than that.\n",
    "min_sentence_length = 2 * window_size + 1\n",
    "text_data = [sentence for sentence in text_data if len(sentence) >= min_sentence_length]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now we start the CBOW training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting Training\n",
      "Loss at step 1000 : 7.567033767700195\n",
      "Loss at step 2000 : 6.271224498748779\n",
      "Loss at step 3000 : 5.686303615570068\n",
      "Loss at step 4000 : 5.262012481689453\n",
      "Loss at step 5000 : 5.101907253265381\n",
      "Nearest to love: grace, group, perfect, voice, know,\n",
      "Nearest to hate: nostalgia, affection, relies, bits, saving,\n",
      "Nearest to happy: closer, subjects, viewer, rent, una,\n",
      "Nearest to sad: perfectly, characterizations, front, watching, examination,\n",
      "Nearest to man: grandeur, tells, us, uses, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 6000 : 4.944213390350342\n",
      "Loss at step 7000 : 5.152814865112305\n",
      "Loss at step 8000 : 4.832143783569336\n",
      "Loss at step 9000 : 4.74461030960083\n",
      "Loss at step 10000 : 4.894524574279785\n",
      "Nearest to love: grace, group, voice, perfect, know,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, viewer, rent, una,\n",
      "Nearest to sad: perfectly, front, characterizations, watching, examination,\n",
      "Nearest to man: grandeur, tells, uses, always, notice,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 11000 : 4.666254043579102\n",
      "Loss at step 12000 : 4.587408065795898\n",
      "Loss at step 13000 : 4.633434295654297\n",
      "Loss at step 14000 : 4.31603479385376\n",
      "Loss at step 15000 : 4.277889251708984\n",
      "Nearest to love: grace, group, voice, perfect, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, viewer, una, rent,\n",
      "Nearest to sad: perfectly, front, characterizations, watching, examination,\n",
      "Nearest to man: grandeur, tells, uses, always, notice,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 16000 : 4.370548725128174\n",
      "Loss at step 17000 : 4.444838047027588\n",
      "Loss at step 18000 : 4.271615505218506\n",
      "Loss at step 19000 : 4.113015651702881\n",
      "Loss at step 20000 : 4.162801742553711\n",
      "Nearest to love: grace, group, voice, brother, davis,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, viewer, una, rent,\n",
      "Nearest to sad: perfectly, front, characterizations, watching, examination,\n",
      "Nearest to man: grandeur, tells, uses, notice, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 21000 : 4.348406791687012\n",
      "Loss at step 22000 : 4.23916482925415\n",
      "Loss at step 23000 : 3.8844363689422607\n",
      "Loss at step 24000 : 4.245398998260498\n",
      "Loss at step 25000 : 4.052057266235352\n",
      "Nearest to love: group, grace, voice, davis, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, viewer, una, rent,\n",
      "Nearest to sad: perfectly, front, characterizations, examination, watching,\n",
      "Nearest to man: grandeur, tells, uses, notice, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 26000 : 4.114257335662842\n",
      "Loss at step 27000 : 4.162069320678711\n",
      "Loss at step 28000 : 3.936769485473633\n",
      "Loss at step 29000 : 4.292949676513672\n",
      "Loss at step 30000 : 4.095461368560791\n",
      "Nearest to love: group, voice, grace, davis, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, viewer, una, sustain,\n",
      "Nearest to sad: perfectly, characterizations, front, examination, watching,\n",
      "Nearest to man: grandeur, tells, uses, notice, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 31000 : 4.007695198059082\n",
      "Loss at step 32000 : 4.172837734222412\n",
      "Loss at step 33000 : 4.018633842468262\n",
      "Loss at step 34000 : 4.029984951019287\n",
      "Loss at step 35000 : 3.9891326427459717\n",
      "Nearest to love: group, voice, grace, davis, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, una, viewer, sustain,\n",
      "Nearest to sad: perfectly, characterizations, front, examination, watching,\n",
      "Nearest to man: grandeur, tells, uses, notice, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 36000 : 3.6878137588500977\n",
      "Loss at step 37000 : 3.899444580078125\n",
      "Loss at step 38000 : 3.817183256149292\n",
      "Loss at step 39000 : 3.675259590148926\n",
      "Loss at step 40000 : 3.944573163986206\n",
      "Nearest to love: group, voice, grace, davis, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, una, viewer, sustain,\n",
      "Nearest to sad: perfectly, characterizations, front, examination, watching,\n",
      "Nearest to man: grandeur, tells, uses, notice, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 41000 : 3.9233405590057373\n",
      "Loss at step 42000 : 3.8440487384796143\n",
      "Loss at step 43000 : 4.068832874298096\n",
      "Loss at step 44000 : 4.02271842956543\n",
      "Loss at step 45000 : 4.028810024261475\n",
      "Nearest to love: group, voice, davis, grace, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, una, viewer, sustain,\n",
      "Nearest to sad: perfectly, characterizations, front, examination, entertain,\n",
      "Nearest to man: grandeur, tells, notice, uses, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n",
      "Loss at step 46000 : 3.841127395629883\n",
      "Loss at step 47000 : 3.7923529148101807\n",
      "Loss at step 48000 : 3.8246264457702637\n",
      "Loss at step 49000 : 3.6507818698883057\n",
      "Loss at step 50000 : 4.073050498962402\n",
      "Nearest to love: group, voice, davis, grace, brother,\n",
      "Nearest to hate: nostalgia, affection, bits, relies, saving,\n",
      "Nearest to happy: closer, subjects, una, sustain, viewer,\n",
      "Nearest to sad: perfectly, characterizations, front, examination, entertain,\n",
      "Nearest to man: grandeur, tells, notice, uses, always,\n",
      "Nearest to woman: hour, obvious, visual, great, points,\n",
      "Model saved in file: /home/nick/Documents/tensorflow/book_code/07_Natural_Language_Processing/05_Working_With_CBOW_Embeddings/temp/cbow_movie_embeddings.ckpt\n"
     ]
    }
   ],
   "source": [
    "# Train the CBOW model, reporting loss and nearest neighbors and saving on a schedule.\n",
    "print('Starting Training')\n",
    "loss_vec = []\n",
    "loss_x_vec = []\n",
    "top_k = 5  # number of nearest neighbors\n",
    "for i in range(generations):\n",
    "    step = i + 1\n",
    "    batch_inputs, batch_labels = text_helpers.generate_batch_data(\n",
    "        text_data, batch_size, window_size, method='cbow')\n",
    "    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}\n",
    "\n",
    "    # One optimizer update on this batch\n",
    "    sess.run(optimizer, feed_dict=feed_dict)\n",
    "\n",
    "    # Record and print the loss, evaluated on the same batch after the update\n",
    "    if step % print_loss_every == 0:\n",
    "        loss_val = sess.run(loss, feed_dict=feed_dict)\n",
    "        loss_vec.append(loss_val)\n",
    "        loss_x_vec.append(step)\n",
    "        print('Loss at step {} : {}'.format(step, loss_val))\n",
    "\n",
    "    # Validation: print the top_k most similar words for each validation word\n",
    "    if step % print_valid_every == 0:\n",
    "        sim = sess.run(similarity, feed_dict=feed_dict)\n",
    "        for j, valid_index in enumerate(valid_examples):\n",
    "            valid_word = word_dictionary_rev[valid_index]\n",
    "            # Sort by descending similarity; rank 0 is skipped (presumably the query word itself)\n",
    "            nearest = (-sim[j, :]).argsort()[1:top_k+1]\n",
    "            log_str = \"Nearest to {}:\".format(valid_word)\n",
    "            log_str += ''.join(' {},'.format(word_dictionary_rev[neighbor]) for neighbor in nearest)\n",
    "            print(log_str)\n",
    "\n",
    "    # Save dictionary + embeddings\n",
    "    if step % save_embeddings_every == 0:\n",
    "        # Vocabulary dictionary, pickled next to the checkpoint\n",
    "        vocab_path = os.path.join(data_folder_name,'movie_vocab.pkl')\n",
    "        with open(vocab_path, 'wb') as f:\n",
    "            pickle.dump(word_dictionary, f)\n",
    "\n",
    "        # Embeddings checkpoint, written under the current working directory\n",
    "        model_checkpoint_path = os.path.join(os.getcwd(),data_folder_name,'cbow_movie_embeddings.ckpt')\n",
    "        save_path = saver.save(sess, model_checkpoint_path)\n",
    "        print('Model saved in file: {}'.format(save_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is some matplotlib code to plot the training loss."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Plot the recorded training loss against the generation at which it was measured\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(loss_x_vec, loss_vec, 'k-')\n",
    "ax.set_title('Training Loss per Generation')\n",
    "ax.set_xlabel('Generation')\n",
    "ax.set_ylabel('Loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
